package KnowSparkSQL.DSL.DuplicateRemoval;


import KnowSparkSQL.DSL.MostValueAVGCountJoin.Demo;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Demonstrates {@code Dataset<Row>.distinct()}.
 *
 * <p>{@code distinct()} takes no arguments and returns a new {@code Dataset<Row>}
 * with the duplicate rows of the original removed. The demo prints the same
 * column projection twice, first as-is and then after {@code distinct()}, so the
 * two outputs can be compared side by side.
 */
public class DatasetDistinct {
    /**
     * Reads {@code ./data/student1.json}, projects five columns, and shows the
     * projection before and after de-duplication.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Name the application after this class so the job is identifiable in the
        // Spark UI/logs (previously it borrowed the name of an unrelated Demo class).
        SparkSession sparkSession = SparkSession.builder()
                .appName(DatasetDistinct.class.getName())
                .master("local")
                .getOrCreate();

        try {
            Dataset<Row> students = sparkSession.read().json("./data/student1.json");

            // Build the projection once and reuse it for both outputs.
            Dataset<Row> selected =
                    students.selectExpr("name", "age", "sex", "institute", "phone");

            // Projection as read from the file.
            selected.show();
            // Same projection with duplicate rows removed.
            selected.distinct().show();
        } finally {
            // Always release the session, even if reading or showing fails.
            sparkSession.stop();
        }
    }
}

